In [ ]:
#install numerapi
!pip install --upgrade pip
!pip install --upgrade numerapi
Collecting pip
  Using cached pip-22.0.4-py3-none-any.whl (2.1 MB)
ERROR: causalml 0.9.0 has requirement numpy<1.19.0,>=0.16.0, but you'll have numpy 1.19.1 which is incompatible.
ERROR: causalml 0.9.0 has requirement scipy==1.4.1, but you'll have scipy 1.6.0 which is incompatible.
Installing collected packages: pip
Successfully installed pip-22.0.4
Requirement already up-to-date: numerapi in /home/jacobstahl/.local/lib/python3.8/site-packages (2.11.0)
Requirement already satisfied, skipping upgrade: python-dateutil in /usr/lib/python3/dist-packages (from numerapi) (2.7.3)
Requirement already satisfied, skipping upgrade: requests in /home/jacobstahl/.local/lib/python3.8/site-packages (from numerapi) (2.24.0)
Requirement already satisfied, skipping upgrade: pytz in /usr/lib/python3/dist-packages (from numerapi) (2019.3)
Requirement already satisfied, skipping upgrade: pandas>=1.1.0 in /home/jacobstahl/.local/lib/python3.8/site-packages (from numerapi) (1.2.1)
Requirement already satisfied, skipping upgrade: tqdm>=4.29.1 in /home/jacobstahl/.local/lib/python3.8/site-packages (from numerapi) (4.51.0)
Requirement already satisfied, skipping upgrade: click>=7.0 in /usr/lib/python3/dist-packages (from numerapi) (7.0)
Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /home/jacobstahl/.local/lib/python3.8/site-packages (from requests->numerapi) (2020.11.8)
Requirement already satisfied, skipping upgrade: idna<3,>=2.5 in /usr/lib/python3/dist-packages (from requests->numerapi) (2.8)
Requirement already satisfied, skipping upgrade: chardet<4,>=3.0.2 in /usr/lib/python3/dist-packages (from requests->numerapi) (3.0.4)
Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/lib/python3/dist-packages (from requests->numerapi) (1.25.8)
Requirement already satisfied, skipping upgrade: numpy>=1.16.5 in /home/jacobstahl/.local/lib/python3.8/site-packages (from pandas>=1.1.0->numerapi) (1.19.1)
In [ ]:
import pandas as pd
import numpy as np
import numerapi
import os

# make the data directory if it doesn't exist
# makedirs(exist_ok=True) avoids the race between an existence check and mkdir
os.makedirs("data", exist_ok=True)

# download the training data using numerapi
# https://pypi.org/project/numerapi/
napi = numerapi.NumerAPI(verbosity="info")
napi.download_dataset("numerai_training_data.parquet", "data/numerai_training_data.parquet")
2022-05-01 19:34:54,149 INFO numerapi.utils: target file already exists
2022-05-01 19:34:54,149 INFO numerapi.utils: download complete
In [ ]:
# load the training set downloaded above into memory
training_set = pd.read_parquet("data/numerai_training_data.parquet")
In [ ]:
# peek at the first rows: era/data_type columns plus ~1000 feature_* and target_* columns
training_set.head()
Out[ ]:
era data_type feature_dichasial_hammier_spawner feature_rheumy_epistemic_prancer feature_pert_performative_hormuz feature_hillier_unpitied_theobromine feature_perigean_bewitching_thruster feature_renegade_undomestic_milord feature_koranic_rude_corf feature_demisable_expiring_millepede ... target_paul_20 target_paul_60 target_george_20 target_george_60 target_william_20 target_william_60 target_arthur_20 target_arthur_60 target_thomas_20 target_thomas_60
id
n003bba8a98662e4 0001 train 1.0 0.50 1.00 1.00 0.00 0.00 1.00 1.00 ... 0.25 0.25 0.25 0.00 0.166667 0.000000 0.166667 0.000000 0.166667 0.000000
n003bee128c2fcfc 0001 train 0.5 1.00 0.25 0.75 0.00 0.75 0.50 0.75 ... 1.00 1.00 1.00 1.00 0.833333 0.666667 0.833333 0.666667 0.833333 0.666667
n0048ac83aff7194 0001 train 0.5 0.25 0.75 0.00 0.75 0.00 0.75 0.75 ... 0.50 0.25 0.25 0.25 0.500000 0.333333 0.500000 0.333333 0.500000 0.333333
n00691bec80d3e02 0001 train 1.0 0.50 0.50 0.75 0.00 1.00 0.25 1.00 ... 0.50 0.50 0.50 0.50 0.666667 0.500000 0.500000 0.500000 0.666667 0.500000
n00b8720a2fdc4f2 0001 train 1.0 0.75 1.00 1.00 0.00 0.00 1.00 0.50 ... 0.50 0.50 0.50 0.50 0.666667 0.500000 0.500000 0.500000 0.666667 0.500000

5 rows × 1073 columns

In [ ]:
print("number of eras:", len(training_set.era.unique()))
print("number of rows:", len(training_set))

# feature columns all share the "feature_" prefix; startswith is stricter than a
# substring test, which would also match a column merely containing "feature_"
feature_names = [f for f in training_set.columns if f.startswith("feature_")]

# subsample to speed up training and save memory;
# a fixed random_state makes the subsample (and every model trained on it) reproducible
training_set = training_set.sample(100000, random_state=42)
number of eras: 574
number of rows: 2412105
In [ ]:
# baseline model: plain least-squares linear regression over every feature (sklearn)
from sklearn.linear_model import LinearRegression

X_train = training_set[feature_names]
y_train = training_set["target"]

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
Out[ ]:
LinearRegression()
In [ ]:
# fetch the validation split so the models can be scored out of sample
napi.download_dataset(
    "numerai_validation_data.parquet",
    "data/numerai_validation_data.parquet",
)

# read the validation split back into a DataFrame
validation_set = pd.read_parquet("data/numerai_validation_data.parquet")
2022-05-01 19:35:05,998 INFO numerapi.utils: target file already exists
2022-05-01 19:35:05,999 INFO numerapi.utils: download complete
In [ ]:
predictions = linear_model.predict(validation_set[feature_names])

# validation correlation by era
validation_set["linear_prediction"] = predictions
era_correlations = validation_set.groupby("era").apply(
    lambda era: np.corrcoef(era["linear_prediction"], era["target"])[0, 1]
)

!pip install --upgrade plotly

import plotly.express as px

# plot era correlations bar graph, each bar is a correlation between prediction and target by era
fig = px.bar(era_correlations)
# y axis is correlation, x axis is era
fig.update_layout(title="Correlation between prediction and target by era")
fig.show(renderer="notebook")
Requirement already up-to-date: plotly in /home/jacobstahl/.local/lib/python3.8/site-packages (5.7.0)
Requirement already satisfied, skipping upgrade: tenacity>=6.2.0 in /home/jacobstahl/.local/lib/python3.8/site-packages (from plotly) (8.0.1)
Requirement already satisfied, skipping upgrade: six in /home/jacobstahl/.local/lib/python3.8/site-packages (from plotly) (1.15.0)
In [ ]:
# Cumulative sum of the per-era correlations.
# Per-era correlation is used to calculate returns, so the running sum
# estimates expected returns without compounding.
cum_sum = era_correlations.cumsum()

fig = px.bar(cum_sum)
fig.update_layout(title="Cumulative sum of era correlations")
fig.show(renderer="notebook")
In [ ]:
# lets train a catboost model to see if it can beat the linear regression model
# https://catboost.ai/

!pip install --upgrade catboost
import catboost

cat_model = catboost.CatBoostRegressor(
    iterations=1000,
    learning_rate=0.01,
    depth=6,
)
cat_model.fit(training_set[feature_names], training_set["target"], verbose=False)
predictions = cat_model.predict(validation_set[feature_names])

# validation correlation by era
validation_set["cat_prediction"] = predictions
era_correlations = validation_set.groupby("era").apply(
    lambda era: np.corrcoef(era["cat_prediction"], era["target"])[0, 1]
)

import plotly.express as px
# plot era correlations bar graph

fig = px.bar(era_correlations)
# y axis is correlation, x axis is era
fig.update_layout(title="Correlation between prediction and target by era")
fig.show(renderer="notebook")
Requirement already up-to-date: catboost in /home/jacobstahl/.local/lib/python3.8/site-packages (1.0.5)
Requirement already satisfied, skipping upgrade: pandas>=0.24.0 in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (1.2.1)
Requirement already satisfied, skipping upgrade: matplotlib in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (3.3.4)
Requirement already satisfied, skipping upgrade: graphviz in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (0.17)
Requirement already satisfied, skipping upgrade: plotly in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (5.7.0)
Requirement already satisfied, skipping upgrade: numpy>=1.16.0 in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (1.19.1)
Requirement already satisfied, skipping upgrade: six in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (1.15.0)
Requirement already satisfied, skipping upgrade: scipy in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (1.6.0)
Requirement already satisfied, skipping upgrade: python-dateutil>=2.7.3 in /usr/lib/python3/dist-packages (from pandas>=0.24.0->catboost) (2.7.3)
Requirement already satisfied, skipping upgrade: pytz>=2017.3 in /usr/lib/python3/dist-packages (from pandas>=0.24.0->catboost) (2019.3)
Requirement already satisfied, skipping upgrade: cycler>=0.10 in /home/jacobstahl/.local/lib/python3.8/site-packages (from matplotlib->catboost) (0.10.0)
Requirement already satisfied, skipping upgrade: kiwisolver>=1.0.1 in /home/jacobstahl/.local/lib/python3.8/site-packages (from matplotlib->catboost) (1.3.1)
Requirement already satisfied, skipping upgrade: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /home/jacobstahl/.local/lib/python3.8/site-packages (from matplotlib->catboost) (2.4.7)
Requirement already satisfied, skipping upgrade: pillow>=6.2.0 in /home/jacobstahl/.local/lib/python3.8/site-packages (from matplotlib->catboost) (7.2.0)
Requirement already satisfied, skipping upgrade: tenacity>=6.2.0 in /home/jacobstahl/.local/lib/python3.8/site-packages (from plotly->catboost) (8.0.1)
In [ ]:
# running total of the CatBoost model's per-era correlations;
# comparing curves shows the catboost model beats the linear regression baseline
cum_sum = era_correlations.cumsum()

fig = px.bar(cum_sum)
fig.update_layout(title="Cumulative sum of era correlations")
fig.show(renderer="notebook")
In [ ]:
import copy

import torch
from torch import nn

# train a 2-hidden-layer neural network to predict the target
# https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html

def make_model():
    """Build a fresh MLP: features -> 256 -> 256 -> 1. Used for the initial model and for resets."""
    return nn.Sequential(
        nn.Linear(len(feature_names), 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 1),
    )

model = make_model()
model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)  # we use a lower learning rate
criterion = nn.MSELoss()

NUM_ITERATIONS = 300
BATCH_SIZE = 2048

# keep the full datasets as CPU tensors; individual batches are moved to the GPU
training_features = torch.tensor(training_set[feature_names].values).float()
training_target = torch.tensor(training_set["target"].values).float()

validation_features = torch.tensor(validation_set[feature_names].values, dtype=torch.float32)
validation_target = torch.tensor(validation_set["target"].values, dtype=torch.float32)

validation_corrs = []
# deepcopy: a plain `best_model = model` would only alias the live model, whose
# weights keep changing on every later optimizer.step(), so the "best" snapshot
# would silently track the current (possibly worse) model
best_model = copy.deepcopy(model)
record_validation_corr = -9999

# each batch is a random sample (without replacement) of the training rows
batches = [np.random.choice(training_set.shape[0], size=BATCH_SIZE, replace=False) for _ in range(NUM_ITERATIONS)]

for iteration, batch in enumerate(batches):

    model.cuda()
    X, y = training_features[batch], training_target[batch]
    X = X.cuda()
    y = y.cuda()

    # reshape batch to fit model
    X = X.view(X.shape[0], -1)
    # forward pass
    outputs = model(X)
    # calculate loss
    loss = criterion(outputs.squeeze(), y.squeeze())
    # backward pass
    optimizer.zero_grad()
    loss.backward()
    # update weights
    optimizer.step()

    # evaluate on CPU: mean per-era correlation on the validation set;
    # reuse the precomputed validation tensor instead of rebuilding it from the
    # DataFrame on every iteration
    model.cpu()
    with torch.no_grad():
        predictions = model(validation_features).cpu().numpy()
    validation_set["prediction"] = predictions

    era_correlations = validation_set.groupby("era").apply(
        lambda era: np.corrcoef(era["prediction"], era["target"])[0, 1]
    )
    mean_corr = era_correlations.mean()

    validation_corrs.append(mean_corr)

    print(f"iteration {iteration} loss {loss.item()} mean era correlation {mean_corr}", end="\r")

    if mean_corr > record_validation_corr:
        record_validation_corr = mean_corr
        best_model = copy.deepcopy(model)  # snapshot weights at the record, not a reference

    # reset model parameters if val corr doesn't improve after warm-up, or is NaN;
    # sometimes the model gets stuck early in training and fails to improve
    # (explicit parentheses — the original relied on `and` binding tighter than `or`)
    if (iteration > 10 and mean_corr < 0) or np.isnan(mean_corr):
        model = make_model()
        model.cuda()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        criterion = nn.MSELoss()
iteration 299 loss 0.050988636910915375 mean era correlation 0.0145012277351259386
In [ ]:
# learning curve: mean per-era validation correlation at each training iteration
fig = px.bar(validation_corrs)
fig.update_xaxes(title="Iteration")
fig.update_yaxes(title="Mean era correlation")
fig.update_layout(title="Mean era correlation on validation set by iteration")
fig.show(renderer="notebook")
In [ ]:
# score the best NN on the validation set and compare it to the linear regression
# and catboost models; the repeated groupby/corrcoef pattern is factored into a helper

def era_correlation(df, prediction_col, target_col="target"):
    """Per-era Pearson correlation between a prediction column and the target column."""
    return df.groupby("era").apply(
        lambda era: np.corrcoef(era[prediction_col], era[target_col])[0, 1]
    )

def zscore(col):
    """Normalize a validation-set column to zero mean and unit standard deviation."""
    return (validation_set[col] - validation_set[col].mean()) / validation_set[col].std()

best_model.cpu()
predictions = best_model(torch.tensor(validation_set[feature_names].values, dtype=torch.float32)).detach().cpu().numpy()
validation_set["nn_prediction"] = predictions

nn_era_correlations = era_correlation(validation_set, "nn_prediction")
nn_cum_sum = np.cumsum(nn_era_correlations)

cat_model_correlations = era_correlation(validation_set, "cat_prediction")
cat_model_cum_sum = np.cumsum(cat_model_correlations)

linear_model_correlations = era_correlation(validation_set, "linear_prediction")
linear_model_cum_sum = np.cumsum(linear_model_correlations)

# ensemble the nn and catboost models: z-score each model's predictions so they
# are on the same scale, then average them
# (the "ensamble" spelling is kept because downstream cells reference these names)
validation_set["ensamble_prediction"] = (zscore("nn_prediction") + zscore("cat_prediction")) / 2

ensamble_era_correlations = era_correlation(validation_set, "ensamble_prediction")
ensamble_cum_sum = np.cumsum(ensamble_era_correlations)
2022-05-01 21:12:36,471 INFO numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-05-01 21:12:36,471 INFO numexpr.utils: NumExpr defaulting to 8 threads.
In [ ]:
import plotly

fig = plotly.graph_objects.Figure()

# add nn, linear regression and catboost predictions, cum sum
fig.add_scatter(x=np.arange(len(nn_era_correlations)), y=nn_cum_sum, name="Neural Network")
fig.add_scatter(x=np.arange(len(cat_model_correlations)), y=cat_model_cum_sum, name="CatBoost")
fig.add_scatter(x=np.arange(len(linear_model_correlations)), y=linear_model_cum_sum, name="Linear Regression")
fig.add_scatter(x=np.arange(len(ensamble_era_correlations)), y=ensamble_cum_sum, name="Ensamble")

fig.update_layout(title="Cumulative Correlation between prediction and target by era")
fig.show(renderer="notebook")